/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.text.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.Feature;
import org.tribuo.data.text.FeatureAggregator;
import org.tribuo.data.text.FeatureTransformer;
import org.tribuo.data.text.TextPipeline;
import org.tribuo.data.text.TextProcessingException;
import org.tribuo.data.text.TextProcessor;
import org.tribuo.util.tokens.Tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * A pipeline for generating ngram features.
 */
/**
 * A pipeline for generating ngram features.
 * <p>
 * Builds one {@link NgramProcessor} per ngram size from 1 to {@code ngram},
 * optionally hashes the resulting feature space down to {@code hashDim}
 * dimensions, and aggregates duplicate features either by summing their
 * values (term counting) or by emitting binary features.
 */
public class TokenPipeline implements TextPipeline {

    private static final Logger logger = Logger.getLogger(TokenPipeline.class.getName());

    // Populated in postConfig: one NgramProcessor per ngram size 1..ngram.
    private final List<TextProcessor> processors = new ArrayList<>();
    // Populated in postConfig: a single FeatureHasher when hashDim > 0, otherwise empty.
    private final List<FeatureTransformer> transformers = new ArrayList<>();
    // Chosen in postConfig based on termCounting; never null after construction/configuration.
    private FeatureAggregator aggregator;

    @Config(mandatory = true,description="Use term counting, otherwise emit binary features.")
    private boolean termCounting;

    @Config(description="Dimension to map the hash into.")
    private int hashDim = -1;

    @Config(mandatory = true,description="Tokenizer to use.")
    private Tokenizer tokenizer;

    @Config(description="n in the n-gram to emit.")
    private int ngram = 2;

    /**
     * Creates a new token pipeline.
     * 
     * @param tokenizer The tokenizer to use to split up the text into words (i.e., 
     * features.)
     * @param ngram The maximum size of ngram features to add to the features
     * generated by the pipeline. A value of {@code n} means that ngram features
     * of size 1-n will be generated. A good standard value to use is 2, which means
     * that unigram and bigram features will be generated. You will very likely see
     * diminishing returns for larger values of {@code n} but there will be times
     * when they will be necessary.
     * @param termCounting If {@code true}, multiple occurrences of terms
     * in the document will be counted and the count will be used as the value
     * of the features that are produced.
     * @throws IllegalArgumentException If {@code ngram} is less than 1.
     */
    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting) {
        this(tokenizer, ngram, termCounting, -1);
    }
    
    /**
     * Creates a new token pipeline.
     *
     * @param tokenizer The tokenizer to use to split up the text into words
     * (i.e., features.)
     * @param ngram The maximum size of ngram features to add to the features
     * generated by the pipeline. A value of {@code n} means that ngram
     * features of size 1-n will be generated. A good standard value to use is
     * 2, which means that unigram and bigram features will be generated. You
     * will very likely see diminishing returns for larger values of
     * {@code n} but there will be times when they will be necessary.
     * @param termCounting If {@code true}, multiple occurrences of terms
     * in the document will be counted and the count will be used as the value
     * of the features that are produced.
     * @param dimension The maximum dimension for the feature space. If this value 
     * is greater than 0, then at most {@code dimension} features will be
     * generated through the use of a hashing function that will collapse the
     * feature space.
     * @throws IllegalArgumentException If {@code ngram} is less than 1.
     */
    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension) {
        this.tokenizer = tokenizer;
        this.ngram = ngram;
        this.hashDim = dimension;
        this.termCounting = termCounting;
        postConfig();
    }

    /**
     * For olcut.
     */
    private TokenPipeline() {}

    /**
     * Used by the OLCUT configuration system, and should not be called by external code.
     * <p>
     * Builds the processor, transformer and aggregator state from the configured fields.
     * @throws IllegalArgumentException If {@code ngram} is less than 1.
     */
    @Override
    public void postConfig() {
        if (ngram < 1) {
            // A non-positive ngram would silently produce a pipeline emitting no features.
            throw new IllegalArgumentException("ngram must be positive, found " + ngram);
        }
        for (int i = 1; i <= ngram; ++i) {
            processors.add(new NgramProcessor(tokenizer,i,1));
        }
        if (hashDim > 0) {
            transformers.add(new FeatureHasher(hashDim));
        }
        if (termCounting) {
            aggregator = new SumAggregator();
        } else {
            aggregator = new UniqueAggregator(1);
        }
    }

    @Override
    public String toString() {
        if (!transformers.isEmpty()) {
            return ngram + "gramPipeline({1.."+ngram+"}-grams,hashing)";
        } else {
            return ngram + "gramPipeline({1.."+ngram+"}-grams)";
        }
    }

    /**
     * Processes the supplied text, returning the aggregated features.
     * <p>
     * A {@link TextProcessingException} thrown by any processor is logged
     * at {@link Level#INFO} and that processor's output is skipped; the
     * remaining processors still contribute features.
     * @param tag The feature name tag.
     * @param data The text to process.
     * @return The extracted features.
     */
    @Override
    public List<Feature> process(String tag, String data) {
        List<Feature> features = new ArrayList<>();

        for (TextProcessor p : processors) {
            try {
                features.addAll(p.process(tag,data));
            } catch (TextProcessingException e) {
                logger.log(Level.INFO, String.format("TextProcessingException thrown by processor %s with text %s",p,data), e);
            }
        }

        for (FeatureTransformer transformer: transformers) {
            features = transformer.map(tag,features);
        }

        return aggregator.aggregate(features);
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this,"TextPipeline");
    }

}
